#coding=utf-8
from bs4 import BeautifulSoup
from urllib import urlopen
from urlparse import urljoin
import re

def urlget(url):
    '''Download ``url`` and return its body as a unicode object.

    The raw bytes are decoded with the module-level ``enc`` encoding;
    byte sequences that are invalid in that encoding are dropped.

    Returns None (after printing a message) when the download or the
    decoding fails, so callers should check the result before using it.
    '''
    try:
        response = urlopen(url)
        try:
            raw = response.read()
        finally:
            # always release the connection, even when read() fails
            response.close()
        return raw.decode(enc, 'ignore')
    except Exception:
        # Exception instead of a bare except: Ctrl-C and SystemExit must
        # still be able to stop the script.
        print("connection error during get:" + url)
        return None

def import_css(css_url, contents, fetch=None):
    '''Inline the stylesheets pulled in by ``@import url(...)`` rules.

    css_url  -- URL of the stylesheet ``contents`` was loaded from; relative
                import targets are resolved against it.
    contents -- text of that stylesheet.
    fetch    -- optional callable taking an absolute URL and returning the
                stylesheet text, or None on failure. Defaults to ``urlget``.

    Returns ``contents`` with every line that starts with an
    ``@import url(...)`` rule replaced by the text of the imported sheet.
    A rule whose target cannot be fetched is left untouched. Imports inside
    the fetched sheets are not followed.
    '''
    if fetch is None:
        fetch = urlget

    # The target is captured without its surrounding quotes; the rest of the
    # line (media list, semicolon) is consumed so the whole rule is replaced.
    import_rule = re.compile(
        r'^[ \t]*@import url\(\s*[\'"]?([^\'")]+)[\'"]?\s*\)[^\r\n]*', re.M)

    def inline(match):
        # urljoin returns absolute targets (http, https, //host) unchanged,
        # so no separate check for absolute URLs is needed.
        css = fetch(urljoin(css_url, match.group(1).strip()))
        if css is None:
            return match.group(0)
        return css

    # A callable replacement inserts the css literally: backslashes in it
    # (e.g. content:"\201C") must not be read as regex group references.
    return import_rule.sub(inline, contents)

def merge_css():
    '''Replace each linked stylesheet in ``bs`` with an inline <style> tag.

    Every <link rel="stylesheet" href=...> is downloaded (its href resolved
    against the module-level ``url``), run through import_css() to inline
    its @import rules, and then turned in place into a <style> element
    holding the stylesheet text. A stylesheet that cannot be downloaded
    keeps its original <link> tag.
    '''
    for link in bs.findAll(name="link", rel="stylesheet", href=True):
        cssfile = urljoin(url, link["href"])
        contents = urlget(cssfile)
        if contents is None:
            # urlget already reported the failure; leave the <link> alone
            # rather than crash or emit an empty <style>.
            continue
        new_attrs = {"type": "text/css"}
        if link.get("media"):
            # keep the media restriction, otherwise e.g. a print-only
            # sheet would start applying to every medium once inlined
            new_attrs["media"] = link["media"]
        # check if there are any other css files imported by @import url()
        link.string = import_css(cssfile, contents)
        link.name = "style"
        link.attrs = new_attrs

def merge_js():
    '''Inline external javascript files into ``bs``.

    Every <script type="text/javascript" src=...> is downloaded (its src
    resolved against the module-level ``url``); the ``src`` attribute is
    removed and the downloaded code becomes the tag's content. A script
    that cannot be downloaded is left unchanged.
    '''
    # NOTE(review): <script src> tags without an explicit
    # type="text/javascript" are not selected -- confirm this is intended.
    for script in bs.findAll(name="script", type="text/javascript", src=True):
        contents = urlget(urljoin(url, script["src"]))
        if contents is None:
            # keep the external reference instead of dropping src and
            # leaving an empty tag behind
            continue
        del script["src"]
        script.string = contents

def remove_js():
    '''Strip external javascript from ``bs``.

    Only <script> tags that carry both type="text/javascript" and a
    ``src`` attribute are destroyed; other scripts are kept.
    '''
    for external_script in bs.findAll(name="script",
                                      type="text/javascript",
                                      src=True):
        external_script.decompose()

# Page to save; the commented-out lines are alternative targets.
#url="http://localhost/doc/bs4.htm"
url="http://crummy.com/software/BeautifulSoup/bs4/doc/"
#url="http://www.ibm.com/developerworks/cn/aix/library/au-zodb/"

html=urlopen(url)
try:
    bs=BeautifulSoup(html)
finally:
    # the parser has consumed the response; release the connection
    html.close()

# Encoding used both to decode the downloaded css/js and to write the
# result. Falls back to utf-8 when the page encoding was not detected;
# gb2312 is widened to gbk.
enc=bs.original_encoding or 'utf-8'
if enc.lower()=='gb2312':
    enc='gbk'

merge_css()
# call merge_js() here instead to inline the scripts rather than drop them
#merge_js()
remove_js()

# Use the html <title> as filename, dropping characters that are illegal
# in file names (\ / : * ? " < > |) and line breaks. Surrounding
# whitespace is trimmed, and a missing or empty title falls back to
# "untitled" instead of crashing.
title=bs.title.string if bs.title is not None else None
filename=re.sub(r'[\\/:*?"<>|\r\n]','',title or '').strip() or 'untitled'
filename+=".htm"

# binary mode: the document is written already encoded to ``enc``
with open(filename,'wb') as output:
    output.write(bs.encode(enc))
print('done')